In [2]:
import warnings
warnings.filterwarnings('ignore')

import json
import urllib3
import time
import urllib.request
import pandas as pd
from pandas.io.json import json_normalize
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams['savefig.bbox'] = 'tight'
plt.rcParams['savefig.pad_inches'] = 0
plt.rcParams['savefig.format'] = 'pdf'
plt.rcParams['legend.frameon'] = True

#pd.set_option('display.max_rows', 50)
#pd.set_option('display.max_columns', 50)

import seaborn as sns
sns.set_context('notebook')
sns.set_style('whitegrid')
sns.set_palette('deep')

Basic network statistics

Firstly we look at the number of nodes, number of edges and average degree for each window size for the whole period. Note that "number of nodes" means number of individuals who were involved in at least one interaction within the relevant period.

In [5]:
def fullPlot(toPlot,title,x,y,scale,start,end):
    windows = [31536000000,2592000000,604800000, 86400000]
    labels = ['Year Window','Month Window','Week Window','Day Window']
    a4_dims = (11.7, 8.27)
    fig, ax = plt.subplots(figsize=a4_dims)
    #plt.title(title,fontsize=30)
    plt.xlabel(x,fontsize=30)
    plt.ylabel(y,fontsize=30)
    ax.set_yscale(scale)

    with open('degree/degrees.json') as json_file:
        x = json.load(json_file)
        x= pd.DataFrame(x['views'])
        x['time'] = pd.to_datetime(x['time'],unit='ms')
        x['index'] = pd.to_datetime(x['time'],unit='ms')
        x.set_index('index', inplace=True)
        x =x[start:end]
        x['avgdeg']= np.where(x['vertices']<1, x['vertices'], 2*x['edges']/x['vertices'])
        x = x[x[toPlot] != 0]
        x.plot(x='time', y=toPlot,ax=ax, label="Aggregate Graph")

    with open('degree/degreewindows.json') as json_file:
        with open('degree/degreesorted.json') as json_2:
            cc1 = json.load(json_file)
            cc2 = json.load(json_2)
            cc1= pd.DataFrame(cc1['views'])
            cc2= pd.DataFrame(cc2['views'])
            cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
            cc2['time'] = pd.to_datetime(cc2['time'],unit='ms')
            cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
            cc2['index'] = pd.to_datetime(cc2['time'],unit='ms')
            cc1.set_index('index', inplace=True)
            cc2.set_index('index', inplace=True)
            cc1=cc1[start:end]
            cc2=cc2[start:end]
            index = 0
            for i in windows:
                color=next(ax._get_lines.prop_cycler)['color']
                y = cc1[cc1['windowsize'] == i]
                z = cc2[cc2['windowsize'] == i]
                y['avgdeg']= np.where(y['vertices']<1, y['vertices'], 2*y['edges']/y['vertices'])
                z['avgdeg']= np.where(z['vertices']<1, z['vertices'], 2*z['edges']/z['vertices'])
                #y = y[y[toPlot] != 0]
                y.plot(x='time', y=toPlot,ax=ax, label=labels[index], color = color)
                #z.plot(x='time', y=toPlot,ax=ax, label='_nolegend_', color = color, linestyle="--", alpha=0.8)
                index +=1

    with open('degree/degreehours.json') as json_file:
        with open('degree/degreesortedhour.json') as json_2:
            color=next(ax._get_lines.prop_cycler)['color']
            x = json.load(json_file)
            y = json.load(json_2)
            x= pd.DataFrame(x['views'])
            y= pd.DataFrame(y['views'])
            x['index'] = pd.to_datetime(x['time'],unit='ms')
            y['index'] = pd.to_datetime(y['time'],unit='ms')
            x['time'] = pd.to_datetime(x['time'],unit='ms')
            y['time'] = pd.to_datetime(y['time'],unit='ms')
            x.set_index('index', inplace=True)
            y.set_index('index', inplace=True)
            x =x[start:end]
            y =y[start:end]
            x['avgdeg']= np.where(x['vertices']<1, x['vertices'], 2*x['edges']/x['vertices'])
            y['avgdeg']= np.where(y['vertices']<1, y['vertices'], 2*y['edges']/y['vertices'])
            x['hour'] = x.apply(lambda row: int(row['time'].hour),axis=1)
            y['hour'] = y.apply(lambda row: int(row['time'].hour),axis=1)
            x = x[x[toPlot] != 0]
            y = y[y[toPlot] != 0]
            x.plot(x='time', y=toPlot,ax=ax, color=color, label="Hour window")
            #y.plot(x='time', y=toPlot,ax=ax, color=color, label='_nolegend_',linestyle="--", alpha=0.8)
        
    #plt.axvline('2016-11-09',linestyle='')
    #plt.axvline('2017-08-11')
    #plt.axvline('2017-08-13')
    plt.legend(fontsize=20, loc='upper left')
    plt.xlabel('Date',fontsize=30)
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)
    plt.savefig('graphs/'+toPlot+'.png')
    plt.show()
In [4]:
def plot_window_scale(toPlot, title, y, no_parts=3):
    a4_dims = (11.7, 8.27)
    fig, ax = plt.subplots(figsize=a4_dims)
    #plt.title(title,fontsize=30)
    plt.xlabel("Window Size",fontsize=30)
    plt.ylabel(y,fontsize=30)
    ax.set_xscale('log')
    ax.set_yscale('log')
    windows = np.array([3600000, 86400000, 604800000, 2592000000, 31536000000])
    means = np.zeros((no_parts,5),dtype=float)
    sds = np.zeros((no_parts,5),dtype=float)

    with open('degree/degreewindows.json') as json_file:            
        x = json.load(json_file)
        x= pd.DataFrame(x['views'])
        x['time'] = pd.to_datetime(x['time'],unit='ms')
        x['index'] = pd.to_datetime(x['time'],unit='ms')
        x.set_index('index', inplace=True)
        x['avgdeg']= np.where(x['vertices']<1, x['vertices'], 2*x['edges']/x['vertices'])
        x = x[x[toPlot] != 0]
        for j in range(1,5):
            z = x[x['windowsize']==round(windows[j])]
            dfs = np.array_split(z,no_parts)
            for i in range(no_parts):
                df = dfs[i]
                means[i,j]=df[toPlot].mean()
                sds[i,j]=df[toPlot].std()
                
    with open('degree/degreehours.json') as json_file:
        x = json.load(json_file)
        x= pd.DataFrame(x['views'])
        x['time'] = pd.to_datetime(x['time'],unit='ms')
        x['index'] = pd.to_datetime(x['time'],unit='ms')
        x.set_index('index', inplace=True)
        x['avgdeg']= np.where(x['vertices']<1, x['vertices'], 2*x['edges']/x['vertices'])
        x = x[x[toPlot] != 0]
        dfs = np.array_split(x,no_parts)
        for i in range(no_parts):
            df = dfs[i]
            means[i,0]=df[toPlot].mean()
            sds[i,0]=df[toPlot].std()
        
    for i in range(no_parts):
        plt.plot(windows, means[i], label = "Period "+str(i+1),marker='^')
        plt.fill_between(windows, means[i]-1.96*sds[i], means[i]+1.96*sds[i], alpha = 0.3)
    
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)
    plt.legend(loc='upper left',fontsize=20)
    plt.show()

Number of nodes

First we examine how the number of nodes is reported according to each window size. We see that:

  • The values for the year and the month window diverge by huge scales. This suggests that there is an important distinction to be made about reported "total" number of users vs "active" number of users for social networks.
  • On medium scales of a month and week, we see 'bumps' following news events of interest to the Gab community such as the Trump election in Nov 16 and Charlottesville "Unite the Right" rally in Aug 17, which seem to be obscured in larger timescales and just noise in smaller scales. Perhaps weekly/monthly windows are a good candidate size for time series anomaly detection tasks?
In [6]:
fullPlot('vertices','Total Number of Vertices','Date','Number of Vertices','linear','2016-09-30 23:00:00','2018-10-30')
In [85]:
plot_window_scale('vertices', "Mean number of Nodes", "Mean number of Nodes", 5)

Number of edges

Not much to say here apart from, again, the hugely differing scales of looking at a month vs a year.

In [9]:
fullPlot('edges', 'Total Number of Edges', 'Date', 'Edges', 'linear','2016-09-30 23:00:00','2018-10-30')
In [84]:
plot_window_scale('edges', "Mean number of Edges", "Mean number of Edges", 5)

Average degree

Calculated as 2x|edges|/|vertices|. We see that:

  • For medium sized windows this is largely varying, with the 'dips' being caused by the bursts in number of vertices.
  • For daily and hourly windows, it seems to be just constant + random noise (or constant+seasonal, I haven't looked into it yet.) It just seems interesting to me that there's seemingly no trend for the hour one compared to other sizes.
In [11]:
fullPlot('avgdeg', 'Average Degree', 'Date', 'Average Degree', 'linear','2016-09-30 23:00:00','2018-10-30')
In [12]:
plot_window_scale('avgdeg', "Average Degree", "Average Degree", 5)
In [81]:
from matplotlib import gridspec

def plotNewVsExisting(toPlot,start,end,y,scale):
    windows = [31536000000,2592000000,604800000, 86400000]
    labels = ['Year Window','Month Window','Week Window','Day Window']
    a4_dims = (11.7, 8.27)
    fig = plt.figure(figsize=a4_dims)
    gs = gridspec.GridSpec(2, 1, height_ratios=[1, 2])
    ax0 = fig.add_subplot(gs[0])
    ax0.set_ylim((0,1))
    ax1 = fig.add_subplot(gs[1],sharex=ax0)
    plt.xlabel("Date",fontsize=30)
    plt.ylabel(y,fontsize=20)
    ax0.set_yscale(scale)

    with open('degree/degrees.json') as json_file:
        x = json.load(json_file)
        x= pd.DataFrame(x['views'])
        x['time'] = pd.to_datetime(x['time'],unit='ms')
        x['index'] = pd.to_datetime(x['time'],unit='ms')
        x.set_index('index', inplace=True)
        aggr =x[start:end]
        aggr.plot(x='time', y=toPlot,ax=ax1, label="Aggregate Graph")

    with open('degree/degreewindows.json') as json_file:
        with open('degree/degreesorted.json') as json_2:
            cc1 = json.load(json_file)
            cc2 = json.load(json_2)
            cc1= pd.DataFrame(cc1['views'])
            cc2= pd.DataFrame(cc2['views'])
            cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
            cc2['time'] = pd.to_datetime(cc2['time'],unit='ms')
            cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
            cc2['index'] = pd.to_datetime(cc2['time'],unit='ms')
            cc1.set_index('index', inplace=True)
            cc2.set_index('index', inplace=True)
            cc1=cc1[start:end]
            cc2=cc2[start:end]
            index = 0
            for i in windows:
                diff_size = int(i/86400000);
                color=next(ax1._get_lines.prop_cycler)['color']
                y = cc1[cc1['windowsize'] == i]
                z = cc2[cc2['windowsize'] == i]
                y['avgdeg']= np.where(y['vertices']<1, y['vertices'], 2*y['edges']/y['vertices'])
                y['new']= aggr[toPlot].diff(diff_size)
                y['prop_new'] = np.where(y[toPlot]<1,1,y['new']/y[toPlot])
                y = y[y[toPlot] != 0]
                y.plot(x='time', y=toPlot,ax=ax1, label=labels[index], color = color)
                y.plot(x='time', y="prop_new",ax=ax0, label=labels[index], color = color)
                #z.plot(x='time', y=toPlot,ax=ax, label='_nolegend_', color = color, linestyle="--", alpha=0.8)
                index +=1

    with open('degree/degreehours.json') as json_file:
        with open('degree/degreesortedhour.json') as json_2:
            color=next(ax1._get_lines.prop_cycler)['color']
            x = json.load(json_file)
            y = json.load(json_2)
            x= pd.DataFrame(x['views'])
            y= pd.DataFrame(y['views'])
            x['index'] = pd.to_datetime(x['time'],unit='ms')
            y['index'] = pd.to_datetime(y['time'],unit='ms')
            x['time'] = pd.to_datetime(x['time'],unit='ms')
            y['time'] = pd.to_datetime(y['time'],unit='ms')
            x.set_index('index', inplace=True)
            y.set_index('index', inplace=True)
            x =x[start:end]
            y =y[start:end]
            #x['prop_new'] = np.where(x[toPlot]<1,1,x['new']/x[toPlot])
            x['avgdeg']= np.where(x['vertices']<1, x['vertices'], 2*x['edges']/x['vertices'])
            y['avgdeg']= np.where(y['vertices']<1, y['vertices'], 2*y['edges']/y['vertices'])
            x['hour'] = x.apply(lambda row: int(row['time'].hour),axis=1)
            y['hour'] = y.apply(lambda row: int(row['time'].hour),axis=1)
            x = x[x[toPlot] != 0]
            y = y[y[toPlot] != 0]
            x.plot(x='time', y=toPlot,ax=ax1, color=color, label="Hour window")
            #x.plot(x='time', y="prop_new",ax=ax0, color=color, label="Hour window")
            #y.plot(x='time', y=toPlot,ax=ax, color=color, label='_nolegend_',linestyle="--", alpha=0.8)
        
    #plt.axvline('2016-11-09',linestyle='')
    #plt.axvline('2017-08-11')
    #plt.axvline('2017-08-13')
    ax0.get_legend().remove()
    ax0.set_title("Proportion of "+toPlot+" that are new", fontsize=20)
    ax1.legend(fontsize=15, loc='upper left')
    plt.xlabel('Date',fontsize=30)
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)
    plt.savefig('graphs/'+toPlot+'.png')
    plt.show()
In [82]:
plotNewVsExisting("vertices","2016-09-30 23:00:00","2018-10-30","Number of Vertices","linear")
In [83]:
plotNewVsExisting("edges","2016-09-30 23:00:00","2018-10-30","Number of Edges","linear")

Connected Components Analysis

This section contains the analysis of the size/proportion of the largest connected components, as well as the number of connected components. In the proportion and number, we exclude components comprising just one edge (two nodes) from the calculations.

In [48]:
def fullPlot(toPlot,title,x,y,scale,start,end):
    windows = [31536000000,2592000000,604800000,86400000]
    labels = ['Year Window','Month Window','Week Window','Day Window']
    a4_dims = (11.7, 8.27)
    fig, ax = plt.subplots(figsize=a4_dims)
    #plt.title(title,fontsize=30)
    plt.xlabel(x,fontsize=30)
    plt.ylabel(y,fontsize=30)
    ax.set_yscale(scale)

    with open('CC/nowindow.json') as json_file:
        x = json.load(json_file)
        x= pd.DataFrame(x['views'])
        x['time'] = pd.to_datetime(x['time'],unit='ms')
        x['index'] = pd.to_datetime(x['time'],unit='ms')
        x.set_index('index', inplace=True)
        x =x[start:end]
        x = x[x['total'] != 0]
        x['mean'] = x[toPlot].rolling(window=4,center=False).mean()
        x.plot(x='time', y=toPlot,ax=ax, label="Aggregate Graph")

    with open('CC/bigCC.json') as json_file:
        cc1 = json.load(json_file)
        cc1= pd.DataFrame(cc1['views'])
        cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
        cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
        cc1.set_index('index', inplace=True)
        cc1=cc1[start:end]
        cc1 = cc1[cc1[toPlot] != 0]
        index = 0
        for i in windows:
            y = cc1[cc1['windowsize'] == i]
            y['mean'] = y[toPlot].rolling(window=4,center=False).mean()
            hours = str(int((i/3600000)))+" hour window"
            y.plot(x='time', y=toPlot,ax=ax, label=labels[index])
            index +=1

    with open('CC/cc1hour.json') as json_file:
        x = json.load(json_file)
        x= pd.DataFrame(x['views'])
        x['index'] = pd.to_datetime(x['time'],unit='ms')
        x['time'] = pd.to_datetime(x['time'],unit='ms')
        x.set_index('index', inplace=True)
        x =x[start:end]
        #x = x[x['hour'].isin([4,16])]
        x.plot(x='time', y=toPlot,ax=ax, color="#f0134d", label="Hour window")
    ax.legend(fontsize=20,framealpha=0.9,loc='upper left')
    plt.tight_layout()
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=30)
    plt.xlabel("Date")
    plt.savefig("Graphs/"+toPlot+"partial.png")
    plt.show()

    
partial_dates = ['2017-09-30 23:00:00','2017-10-30']
full_dates = ['2016-09-30 23:00:00','2018-10-30']
#x['mean'] = x[toPlot].rolling(window=24,center=False).mean()
#[0,4,8,12,16,20,24]
#'2016-07-01': '2018-05-03'

Proportion (without isolated nodes)

We measure the size of the largest connected component as a proportion of the size of the whole graph for that window.

Whole time period

We find that:

  • Just a day-length window is large enough to observe a connected component that is always more than 90% of the graph.
  • Hourly window is a total mess, look into this later.
In [49]:
fullPlot('proportionWithoutIslands','Largest Connected Component % of Graph','Date','Proportion of Graph','linear','2016-09-30 23:00:00','2018-10-30')

Zoomed in

When zooming in to just a month subset of data, we see that the hourly window size is actually showing diurnal behaviour which we will explore later.

In [50]:
fullPlot('proportionWithoutIslands','Largest Connected Component % of Graph','Date','Proportion of Graph','linear','2017-09-30 23:00:00','2017-10-30')

Size of the largest connected component

On the whole, this shows similar behaviour to just the number of vertices.

In [51]:
fullPlot('biggest','Largest Connected Component Size','Date','Largest connected Component','linear','2016-09-30 23:00:00','2018-10-30')

Number of connected components

This seems to be similar in trend to the size of the largest (note the much smaller scale of ~600 components in total).

In [52]:
fullPlot('totalWithoutIslands','Number of Connected Components','Date','Total Connected Components','linear','2016-09-30 23:00:00','2018-10-30')

At what size does the giant component break?

The plot below shows a CDF of the 'proportion' data for different window sizes, with particular attention to sizes between an hour and a day. Would be maybe helpful to provide a takeaway statistic like "for all window sizes, there's a component of size x% for y% of the time.

In [14]:
def proportionWindowCDFPlot(toPlot,title,x,y,scale,start,end):
    num_bins = 100
    
    windows = [31536000000,2592000000,604800000,86400000]
    windows2 = [43200000,21600000,14400000,7200000]
    labels = ["Year Window",'Month Window','Week Window','Day Window']
    labels2 = ["12 Hour Window","6 Hour Window","4 Hour Window","2 Hour Window"]
    a4_dims = (11.7, 8.27)
    fig, ax = plt.subplots(figsize=a4_dims)
    plt.title(title,fontsize=20)
    plt.xlabel(x,fontsize=16)
    plt.ylabel(y,fontsize=16)
    ax.set_yscale(scale)

    with open('CC/nowindow.json') as json_file:
        x = json.load(json_file)
        x= pd.DataFrame(x['views'])
        x['time'] = pd.to_datetime(x['time'],unit='ms')
        x['index'] = pd.to_datetime(x['time'],unit='ms')
        x.set_index('index', inplace=True)
        x =x[start:end]
        x = x[x['total'] != 0]
        x['mean'] = x[toPlot].rolling(window=4,center=False).mean()
        counts, bin_edges = np.histogram (x[toPlot], bins=num_bins, normed=False)
        cdf = np.cumsum (counts)
        l,=plt.plot (bin_edges[1:], cdf/cdf[-1])
        l.set_label("No window")

    with open('CC/bigCC.json') as json_file:
        cc1 = json.load(json_file)
        cc1= pd.DataFrame(cc1['views'])
        cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
        cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
        cc1.set_index('index', inplace=True)
        cc1=cc1[start:end]
        cc1 = cc1[cc1[toPlot] != 0]
        index = 0
        for i in windows:
            y = cc1[cc1['windowsize'] == i]
            y['mean'] = y[toPlot].rolling(window=4,center=False).mean()
            hours = str(int((i/3600000)))+" hour window"
            counts, bin_edges = np.histogram (y[toPlot], bins=num_bins, normed=False)
            cdf = np.cumsum (counts)
            l,=plt.plot (bin_edges[1:], cdf/cdf[-1])
            l.set_label(labels[index])
            index +=1

    with open('CC/ccwindowsecondset.json') as json_file:
        cc1 = json.load(json_file)
        cc1= pd.DataFrame(cc1['views'])
        cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
        cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
        cc1.set_index('index', inplace=True)
        cc1=cc1[start:end]
        cc1 = cc1[cc1[toPlot] != 0]
        index = 0
        for i in windows2:
            y = cc1[cc1['windowsize'] == i]
            y['mean'] = y[toPlot].rolling(window=4,center=False).mean()
            hours = str(int((i/3600000)))+" hour window"
            counts, bin_edges = np.histogram (y[toPlot], bins=num_bins, normed=False)
            cdf = np.cumsum (counts)
            l,=plt.plot (bin_edges[1:], cdf/cdf[-1])
            l.set_label(labels2[index])
            index +=1

    with open('CC/cc1hour.json') as json_file:
        x = json.load(json_file)
        x= pd.DataFrame(x['views'])
        x['index'] = pd.to_datetime(x['time'],unit='ms')
        x['time'] = pd.to_datetime(x['time'],unit='ms')
        x.set_index('index', inplace=True)
        x =x[start:end]
        x = x[x['total'] != 0]
        x['hour'] = x.apply(lambda row: int(row['time'].hour),axis=1)
        counts, bin_edges = np.histogram (x[toPlot], bins=num_bins, normed=False)
        cdf = np.cumsum (counts)
        plt.plot(bin_edges[1:], cdf/cdf[-1],label="Hour window")
        #x.plot(x='time', y=toPlot,ax=ax, color="#f0134d",)
    ax.legend(fontsize=16)
    plt.savefig("Graphs/ConnectedComponentsCDF.png")
    plt.show()



proportionWindowCDFPlot('proportion','Biggest Connected Components % of graph','Proportion of the graph', 
                        'Proportion of Time','linear','2016-09-01', '2018-05-03')

Diurnal activity as shown by the hourly window

Batching the data by hour of the day we can see some diurnal behaviour as it's a mostly US-based platform.

Initial box plot

Seaborns documentation: parameter whis controlling how far whiskers stretch. Upper whisker stretches to the furthest datapoint within [UQ, UQ + whis*IQR]

In [25]:
def diurnal_plot(toPlot, title, x, y, scale,start,end):
    
    windows = [31536000000,2592000000,604800000,86400000]
    #start = 1483228800000
    no_weeks = 10
    
    a4_dims = (11.7, 8.27)
    fig, ax = plt.subplots(1,1,figsize=a4_dims)
    #ax[0].set_title(title,fontsize=20)
    #ax.set_title("Boxplot of "+title,fontsize=30)
    #ax[0].set_xlabel(x,fontsize=20)
    ax.set_xlabel(x,fontsize=30)
    #ax[0].set_ylabel(y,fontsize=20)
    ax.set_ylabel(y,fontsize=30)
    #ax[0].set_yscale(scale)
    ax.set_yscale(scale)
    
    dfs = {}
    
    with open('CC/cc1hour.json') as json_file:
        x = json.load(json_file)
        x= pd.DataFrame(x['views'])
        x['index'] = pd.to_datetime(x['time'],unit='ms')
        x['time'] = pd.to_datetime(x['time'],unit='ms')
        x.set_index('index', inplace=True)
        x =x[start:end]
        x = x.set_index('time')
        x['Weekday Name'] = x.index.weekday_name
        x['Hour'] = x.index.hour
    #print(x.dtypes)
        x = x[x['total'] != 0]
        means = x.groupby(x.index.hour).mean()
        sds = x.groupby(x.index.hour).std()
        #ax[0].plot(means['Hour'],means[toPlot])
        #ax[0].fill_between(means['Hour'], means[toPlot]-sds[toPlot]/2.0, means[toPlot]+sds[toPlot]/2.0, alpha=0.3)
        
        ax=sns.boxplot(data=x, x='Hour', y=toPlot)
        plt.xlabel("Hour (UTC)")
        plt.rc('xtick',labelsize=20)
        plt.rc('ytick',labelsize=20)
        plt.ylabel(y,fontsize=30)
        plt.tight_layout()
        plt.savefig("Graphs/"+y.replace(" ","_")+".png")
        plt.show()
        
diurnal_plot('proportion','Largest Connected Component % of graph','Hour','Proportion of Graph','linear','2016-11-30', '2018-10-30')
diurnal_plot('biggest','Largest Connected Component Size','Hour','Largest connected Component','linear','2016-11-30','2018-10-30')
diurnal_plot('totalWithoutIslands','Total Connected Components','Hour','Total Connected Components','linear','2016-11-30','2018-10-30')

Time-series analysis of the LCC in the hour window

This section explores deeper the existence of periodic/diurnal behaviour in the size/proportion of giant component size using fourier analysis. We will look first at the proportion data.

The fourier transform and power spectrum of the shows peaks at the 24hr, 12hr, and 6hr frequencies.

In [3]:
# Look at power spectrum of proportion data
from scipy import fftpack, signal


with open('CC/cc1hour.json') as json_file:
    x = json.load(json_file)
    x= pd.DataFrame(x['views'])
    x['index'] = pd.to_datetime(x['time'],unit='ms')
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    x.set_index('index', inplace=True)
    json_file.close()
    
data_prop = signal.detrend(np.array(x['proportion']),)

# plot original data (shifted by mean)

plt.plot(range(len(data_prop)),data_prop)
plt.title('Plot of original data')
plt.show()

# get fourier series
fs = 168
x = fftpack.fft(data_prop)
# x is a vector of complex number eigenvalues
freqs = fftpack.fftfreq(len(data_prop))*fs

fig, ax = plt.subplots()

ax.stem(freqs, np.abs(x))
ax.set_xlabel('Frequency (times per '+str(fs)+'hrs)')
ax.set_xlim(0,0.5*fs)
ax.set_ylabel('Frequency Domain (Spectrum) Magnitude')
plt.title('Fourier transform of hourly data')
plt.tight_layout()
plt.show()
In [180]:
# Compute power spectrum for clearer picture
freqs, P_xx = signal.periodogram(data_prop, fs, scaling = 'density')
plt.plot(freqs, P_xx)
plt.title('Power spectrum of data')
plt.show()

Below is the inverse Fourier transform (IFT) of the first few frequencies, showing the 'hum'.

In [181]:
# Compute inverse fourier transform of first few frequencies.

tmp = np.zeros(len(x))
tmp[:20]=x[:20]
IFT = fftpack.ifft(tmp)
plt.plot(range(len(data_prop)),data_prop, color='black')
plt.plot(range(len(data_prop)),IFT,color='red')
plt.show()

The IFT of the largest magnitude 10 frequencies looks fairly as expected, looking like a sine wave with 24h period.

In [182]:
#First 10 harmonics against real data for 1 week

ix_full = np.argsort(-1*np.absolute(x))
res_full = np.absolute(x[ix_full])

tmp = np.zeros(len(x),dtype=np.complex)
tmp[ix_full[:10]]=x[ix_full[:10]]
res1 = np.fft.ifft(tmp)

plt.figure(figsize=(8,5))
plt.plot(range(168),data_prop[1000:1168],color='black')
plt.plot(range(168),res1[1000:1168],color='red',linewidth=3)
plt.show()

If we allow the IFT of the largest 240 harmonics we actually see a second slightly smaller peak slightly before a bigger peak, suggesting the presence of a European userbase.

In [183]:
#First 240 harmonics

tmp = np.zeros(len(x), dtype=np.complex)
tmp[ix_full[:240]]=x[ix_full[:240]]
res2 = np.fft.ifft(tmp)

plt.plot(data_prop[1000:1168],color='black')
plt.plot(np.real(res2[1000:1168]),color='red',linewidth=3)
plt.show()

The following plot is the red line in the previous plot plotted against a 4hr behind version of itself -- it doesn't really add anything but I just want to keep the picture for it somewhere as it's quite pretty!

In [187]:
# Look at the 4-lagged version of cleaned signal
res1_lagged = np.roll(np.real(res1),4)

plt.plot(np.real(res1), res1_lagged, linewidth=0.1)
plt.show()

This following subsection definitely needs more attention from me but is essentially doing the same process but for the absolute size of the LCC, and shows that if we look at the absolute size rather than proportion, we see not just a 24hr frequency but a weekly frequency (component is smaller at weekends)/

In [28]:
# Let's do some the same process but this time with the "size of LCC" data

with open('CC/cc1hour.json') as json_file:
    x = json.load(json_file)
    x= pd.DataFrame(x['views'])
    x['index'] = pd.to_datetime(x['time'],unit='ms')
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    x.set_index('index', inplace=True)
    json_file.close()
    
data_biggest = signal.detrend(np.array(x['biggest']))
In [29]:
plt.plot(range(len(data_biggest)),data_biggest)
plt.title('Plot of original data (detrended)')
plt.show()

# get fourier series
fs = 168
y = fftpack.fft(data_biggest)
# x is a vector of complex number eigenvalues
freqs = fftpack.fftfreq(len(data_biggest))*fs

fig, ax = plt.subplots()

ax.stem(freqs, np.abs(y))
ax.set_xlabel('Frequency (times per '+str(fs)+'hrs)')
ax.set_xlim(0,0.5*fs)
ax.set_ylabel('Frequency Domain (Spectrum) Magnitude')
plt.title('Fourier transform of hourly data')
plt.tight_layout()
plt.show()
In [44]:
# Compute power spectrum for clearer picture
freqs, P_xx = signal.periodogram(data_biggest, fs, scaling = 'density')
plt.plot(freqs, P_xx)
plt.title('Power spectrum of data')
plt.show()
In [34]:
# Compute inverse fourier transform of first few frequencies.

tmp = np.zeros(len(y))
tmp[:20]=y[:20]
IFT = fftpack.ifft(tmp)
plt.plot(range(len(data_biggest)),data_biggest, color='black')
plt.plot(range(len(data_biggest)),IFT,color='red')
plt.show()
In [41]:
#First 10 harmonics against real data for 1 week

ix_full = np.argsort(-1*np.absolute(y))
res_full = np.absolute(y[ix_full])

tmp = np.zeros(len(y),dtype=np.complex)
tmp[ix_full[:50]]=y[ix_full[:50]]
res1 = np.fft.ifft(tmp)

plt.figure(figsize=(8,5))
plt.plot(range(168),data_biggest[1000:1168],color='black')
plt.plot(range(168),res1[1000:1168],color='red',linewidth=3)
plt.show()
In [36]:
#First 240 harmonics

tmp = np.zeros(len(y), dtype=np.complex)
tmp[ix_full[:240]]=y[ix_full[:240]]
res2 = np.fft.ifft(tmp)

plt.plot(data_biggest[10000:10720],color='black')
plt.plot(np.real(res2[10000:10720]),color='red',linewidth=2)
plt.show()
In [45]:
# See if lagged series without harmonics is still correlated.

without_harmonics = data_biggest - np.real(res2)

plt.stem(range(24), acf(without_harmonics,24))
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-45-2e095fd21752> in <module>()
      3 without_harmonics = data_biggest - np.real(res2)
      4 
----> 5 plt.stem(range(24), acf(without_harmonics,24))
      6 plt.show()

NameError: name 'acf' is not defined
In [200]:
# Look at the 4-lagged version of cleaned signal
res1_lagged = np.roll(np.real(res2),4)

plt.plot(np.real(res2), res1_lagged, linewidth=0.1)
plt.show()

Comparison with shuffled timestamps null model

Keep the same links but reorder their timestamps randomly, so that the rate of edge activity is conserved and that the aggregate graph is identical. For more detail on this null model, take a look at Temporal Networks, P. Holme, J. Saramaki (2011) under the heading Randomly Permuted Times (p17).

Proportion

We find that the value for the proportion is always smaller in the shuffled timestamps case than the real data, only slightly so for window sizes greater than a day, but largely so for the hour window. My thoughts are that this is due to the 'memory effect'/'edge persistence' in the real data, i.e. pairwise interactions are fairly bursty, and the chance of an interaction between two users decreases the longer it's been since the last interaction. In this way, we might expect to see a higher number of unique nodes by randomly sampling a number of edges throughout the whole time period than sampling the same number of edges but within a small time slice (i.e. a larger denominator in the 'proportion' for the shuffled than for unshuffled).

In [13]:
def nullComparePlot(toPlot,x,y,scale,start,end,window,lims):
    windows = {"Year": 31536000000,"Month": 2592000000, "Week": 604800000, "Day": 86400000}
    a4_dims = (11.7, 8.27)
    fig, ax = plt.subplots(figsize=a4_dims)
    #plt.title(title,fontsize=30)
    plt.xlabel(x,fontsize=30)
    plt.ylabel(y,fontsize=30)
    ax.set_yscale(scale)
    plt.ylim(lims)
    
    if window=="Hour":
        with open('CC/cc1hour.json') as json_file:
            cc1 = json.load(json_file)
            cc1= pd.DataFrame(cc1['views'])
            cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1.set_index('index', inplace=True)
            cc1=cc1[start:end]
            cc1 = cc1[cc1[toPlot] != 0]
            cc1.plot(x='time', y=toPlot,ax=ax, label="Real Data")
    
        with open('CC/sortedCChour.json') as json_file:
            cc1 = json.load(json_file)
            cc1= pd.DataFrame(cc1['views'])
            cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1.set_index('index', inplace=True)
            cc1=cc1[start:end]
            cc1 = cc1[cc1[toPlot] != 0]
            cc1.plot(x='time', y=toPlot,ax=ax, label="Shuffled Timestamps")
            
    elif window =="4 Hour":
        with open('CC/ccwindowsecondset.json') as json_file:
            cc1 = json.load(json_file)
            cc1= pd.DataFrame(cc1['views'])
            cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1.set_index('index', inplace=True)
            cc1=cc1[start:end]
            cc1 = cc1[cc1[toPlot] != 0]
            y = cc1[cc1['windowsize']==14400000]
            y.plot(x='time', y=toPlot,ax=ax, label="Real Data")
            
        with open('CC/sortedhourslotcc.json') as json_file:
            cc1 = json.load(json_file)
            cc1= pd.DataFrame(cc1['views'])
            cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1.set_index('index', inplace=True)
            cc1=cc1[start:end]
            cc1 = cc1[cc1[toPlot] != 0]
            y = cc1[cc1['windowsize']==14400000]
            y.plot(x='time', y=toPlot,ax=ax, label="Shuffled Timestamps")
            
    elif window =="6 Hour":
        with open('CC/ccwindowsecondset.json') as json_file:
            cc1 = json.load(json_file)
            cc1= pd.DataFrame(cc1['views'])
            cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1.set_index('index', inplace=True)
            cc1=cc1[start:end]
            cc1 = cc1[cc1[toPlot] != 0]
            y = cc1[cc1['windowsize']==21600000]
            y.plot(x='time', y=toPlot,ax=ax, label="Real Data")
            
        with open('CC/sortedhourslotcc.json') as json_file:
            cc1 = json.load(json_file)
            cc1= pd.DataFrame(cc1['views'])
            cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1.set_index('index', inplace=True)
            cc1=cc1[start:end]
            cc1 = cc1[cc1[toPlot] != 0]
            y = cc1[cc1['windowsize']==21600000]
            y.plot(x='time', y=toPlot,ax=ax, label="Shuffled Timestamps")
            
    elif window =="12 Hour":
        with open('CC/ccwindowsecondset.json') as json_file:
            cc1 = json.load(json_file)
            cc1= pd.DataFrame(cc1['views'])
            cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1.set_index('index', inplace=True)
            cc1=cc1[start:end]
            cc1 = cc1[cc1[toPlot] != 0]
            y = cc1[cc1['windowsize']==43200000]
            y.plot(x='time', y=toPlot,ax=ax, label="Real Data")
            
        with open('CC/sortedhourslotcc.json') as json_file:
            cc1 = json.load(json_file)
            cc1= pd.DataFrame(cc1['views'])
            cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1.set_index('index', inplace=True)
            cc1=cc1[start:end]
            cc1 = cc1[cc1[toPlot] != 0]
            y = cc1[cc1['windowsize']==43200000]
            y.plot(x='time', y=toPlot,ax=ax, label="Shuffled Timestamps")
            
    else:
        with open('CC/bigCC.json') as json_file:
            cc1 = json.load(json_file)
            cc1= pd.DataFrame(cc1['views'])
            cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1.set_index('index', inplace=True)
            cc1=cc1[start:end]
            cc1 = cc1[cc1[toPlot] != 0]
            y = cc1[cc1['windowsize'] == windows[window]]
            y.plot(x='time', y=toPlot,ax=ax, label="Real Data")

        with open('CC/sortedCC.json') as json_file:
            cc1 = json.load(json_file)
            cc1= pd.DataFrame(cc1['views'])
            cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
            cc1.set_index('index', inplace=True)
            cc1=cc1[start:end]
            cc1 = cc1[cc1[toPlot] != 0]
            y = cc1[cc1['windowsize'] == windows[window]]
            y.plot(x='time', y=toPlot,ax=ax, label="Shuffled Timestamps")
        
    ax.legend(fontsize=20,framealpha=0.9,loc='lower right')
    plt.tight_layout()
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=30)
    plt.xlabel("Date")

Proportion: Hour window

In [53]:
nullComparePlot('proportionWithoutIslands','Date','Proportion','linear','2017-09-30 23:00:00','2017-10-30','Hour',[0.0,1.0])

Proportion: 4 Hour window

In [6]:
nullComparePlot('proportionWithoutIslands','Date','Proportion','linear','2017-09-30 23:00:00','2017-10-30','4 Hour',[0.0,1.0])

Proportion 6 Hour

In [11]:
nullComparePlot('proportionWithoutIslands','Date','Proportion','linear','2017-09-30 23:00:00','2017-10-30','6 Hour',[0.0,1.0])

Proportion: Day window

In [18]:
nullComparePlot('proportionWithoutIslands','Date','Proportion','linear','2016-09-30 23:00:00','2018-10-30','Day',[0.6,1.0])

Proportion: Week Window

In [19]:
nullComparePlot('proportionWithoutIslands','Date','Proportion','linear','2016-09-30 23:00:00','2018-10-30','Week',[0.6,1.0])

Proportion: Month window

In [20]:
nullComparePlot('proportionWithoutIslands','Date','Proportion','linear','2016-09-30 23:00:00','2018-10-30','Month',[0.6,1.0])

Proportion: Year window

In [21]:
nullComparePlot('proportionWithoutIslands','Date','Proportion','linear','2016-09-30 23:00:00','2018-10-30','Year',[0.6,1.0])

Size of the largest connected component

For this part we instead plot the absolute size of the largest connected component for different windows. Like the proportion, the LCC size is smaller for the shuffled data than the real in the hour window, but confusingly this order switches going up to the day, week and month windows.

For the day, week and month window, I suspect that 'memory effect' might explain this too, in that you're more likely to sample 'weak ties' if sampling from the whole time period as opposed to the same number but from a small time interval.

Raw size of LCC: Hour window

In [54]:
nullComparePlot('biggest','Date','Size of LCC','linear','2017-09-30 23:00:00','2017-10-30','Hour',[0,450])

Raw size of LCC: 4 Hour Window

In [16]:
nullComparePlot('biggest','Date','Size of LCC','linear','2017-09-30 23:00:00','2017-10-30','4 Hour',[0,1000])
In [15]:
nullComparePlot('biggest','Date','Size of LCC','linear','2017-09-30 23:00:00','2017-10-30','6 Hour',[0,1500])
In [17]:
nullComparePlot('biggest','Date','Size of LCC','linear','2017-09-30 23:00:00','2017-10-30','12 Hour',[0,2000])

Raw size of LCC: Day window

In [23]:
nullComparePlot('biggest','Date','Size of LCC','linear','2016-09-30 23:00:00','2018-10-30','Day',[0,10000])

Raw size of LCC: Week window

In [24]:
nullComparePlot('biggest','Date','Size of LCC','linear','2016-09-30 23:00:00','2018-10-30','Week',[0,50000])

Raw size of LCC: Month window

In [25]:
nullComparePlot('biggest','Date','Size of LCC','linear','2016-09-30 23:00:00','2018-10-30','Month',[0,50000])

Raw size of LCC: Year window

In [26]:
nullComparePlot('biggest','Date','Size of LCC','linear','2016-09-30 23:00:00','2018-10-30','Year',[0,100000])

Comparison with Erdos-Renyi Reference model

We compare also (on a smaller time interval because of time constraints!) the size of the LCC for the real data with that of an Erdos-Renyi random graph with the same number of nodes and edges.

Specifically we generate, for each window size and time, a graph with the same number of nodes and edges as the real data, with the edges assigned at random, and compare the size of the LCC of this with the real data.

For each window size, the LCC is consistently overestimated by the E-R model, which may be due to some strong underlying community structure, whereby the second, third etc largest connected components in the real data are non-negligible in size.

Hour window

In [59]:
# Size of largest connected component compared to expected in an E-R Graph

a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
#plt.title(title,fontsize=30)

import networkx as nx

def get_expected_size(n,m):
    sizes=np.zeros(5)
    for i in range(5):
        G = nx.gnm_random_graph(n, m)
        largest = max(nx.connected_component_subgraphs(G), key=len)
        sizes[i]=len(largest)
    return np.mean(sizes)

start, end = '2017-9-23 23:00:00','2017-09-30 23:00:00'

with open('degree/degreehours.json') as json_file:
    x = json.load(json_file)
    x= pd.DataFrame(x['views'])
    x['index'] = pd.to_datetime(x['time'],unit='ms')
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    x.set_index('index', inplace=True)
    x =x[start:end]
    x['mean'] =x.apply(lambda row : get_expected_size(row['vertices'],row['edges']),axis=1)
    #plt.fill_between(x['time'],np.array(x['mean'])-np.array(x['sd']), np.array(x['mean'])+np.array(x['sd']),alpha=0.3)
    #print(x['expected'])
    ax.plot(x['time'],x['mean'],label='Expected size of LCC')
    json_file.close()
    
with open('CC/cc1hour.json') as json_file:
    x = json.load(json_file)
    x= pd.DataFrame(x['views'])
    x['index'] = pd.to_datetime(x['time'],unit='ms')
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    x.set_index('index', inplace=True)
    x =x[start:end]
    x.plot(x='time',y='biggest',ax=ax, label='Size of LCC')
    json_file.close()

plt.xlabel('Time',fontsize=30)
plt.rc('xtick',labelsize=20)
plt.rc('ytick',labelsize=20)
plt.legend(fontsize=20)
plt.savefig("Graphs/largest_vs_expected_hour.png")
plt.show()

Day window

In [60]:
a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
#plt.title(title,fontsize=30)

with open('degree/degreewindows.json') as json_file:
    x = json.load(json_file)
    x= pd.DataFrame(x['views'])
    x=x[x['windowsize']==86400000]
    x['index'] = pd.to_datetime(x['time'],unit='ms')
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    x.set_index('index', inplace=True)
    x =x[start:end]
    x['mean'] =x.apply(lambda row : get_expected_size(row['vertices'],row['edges']),axis=1)
    #print(x['expected'])
    #ax.fill_between(x['time'],np.array(x['mean'])-np.array(x['sd']), np.array(x['mean'])+np.array(x['sd']),alpha=0.3)
    ax.plot(x['time'],x['mean'],label='Expected size of LCC')
    json_file.close()
    
with open('CC/bigCC.json') as json_file:
    x = json.load(json_file)
    x= pd.DataFrame(x['views'])
    x=x[x['windowsize']==86400000]
    x['index'] = pd.to_datetime(x['time'],unit='ms')
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    x.set_index('index', inplace=True)
    x =x[start:end]
    x.plot(x='time',y='biggest',ax=ax, label='Size of LCC')
    json_file.close()
    
plt.xlabel('Time',fontsize=30)
plt.rc('xtick',labelsize=20)
plt.rc('ytick',labelsize=20)
plt.legend(fontsize=20)
plt.show()

Week window

In [61]:
a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
#plt.title(title,fontsize=30)
ax.set_yscale('linear')

with open('degree/degreewindows.json') as json_file:
    x = json.load(json_file)
    x= pd.DataFrame(x['views'])
    x=x[x['windowsize']==604800000]
    x['index'] = pd.to_datetime(x['time'],unit='ms')
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    x.set_index('index', inplace=True)
    x =x[start:end]
    x['expected']=x.apply(lambda row : get_expected_size(row['vertices'],row['edges']),axis=1)
    #print(x['expected'])
    x.plot(x='time',y='expected',ax=ax,label='Expected size of LCC')
    json_file.close()
    
with open('CC/bigCC.json') as json_file:
    x = json.load(json_file)
    x= pd.DataFrame(x['views'])
    x=x[x['windowsize']==604800000]
    x['index'] = pd.to_datetime(x['time'],unit='ms')
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    x.set_index('index', inplace=True)
    x =x[start:end]
    x.plot(x='time',y='biggest',ax=ax, label='Size of LCC')
    json_file.close()

plt.xlabel('Time',fontsize=30)
plt.rc('xtick',labelsize=20)
plt.rc('ytick',labelsize=20)
plt.legend(fontsize=20)
plt.show()

Month window

In [62]:
a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
#plt.title(title,fontsize=30)
plt.xlabel('Time',fontsize=20)
ax.set_yscale('linear')

with open('degree/degreewindows.json') as json_file:
    x = json.load(json_file)
    x= pd.DataFrame(x['views'])
    x=x[x['windowsize']==2592000000]
    x['index'] = pd.to_datetime(x['time'],unit='ms')
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    x.set_index('index', inplace=True)
    x =x[start:end]
    x['expected']=x.apply(lambda row : get_expected_size(row['vertices'],row['edges']),axis=1)
    #print(x['expected'])
    x.plot(x='time',y='expected',ax=ax,label='Expected size of LCC')
    json_file.close()
    
with open('CC/bigCC.json') as json_file:
    x = json.load(json_file)
    x= pd.DataFrame(x['views'])
    x=x[x['windowsize']==2592000000]
    x['index'] = pd.to_datetime(x['time'],unit='ms')
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    x.set_index('index', inplace=True)
    x =x[start:end]
    x.plot(x='time',y='biggest',ax=ax, label='Size of LCC')
    json_file.close()
    
plt.xlabel('Time',fontsize=30)
plt.rc('xtick',labelsize=20)
plt.rc('ytick',labelsize=20)
plt.legend(fontsize=20)
plt.show()

Dynamics of top 20 users

For each window size and window, we obtain the top 20 users in terms of in-degree. Is it the case that some users dominate for long periods, or is it more dynamic?

Jaccard Similarity.

For two sets A and B, the Jaccard similarity is given by |A n B| / |A u B| measuring their percentage overlap.

In [46]:
def jaccard_similarity(list1, list2):
    list1, list2 = set(list1), set(list2)
    intersection_size = len(list1.intersection(list2))
    union_size=len(list1)+len(list2)-intersection_size
    if union_size==0:
        return 0.0
    else: return intersection_size/union_size

Stability over consecutive windows

For each window size, we calculate the Jaccard similarity between the pairs of consecutive non-overlapping windows' top 20 users. The lower plot shows the mean JS for each window size, but as the error bars overlap and as the number of datapoints drops vastly for each window theres nothing much concrete we can say about it yet.

In [49]:
def get_users(cell):
    if len(cell)==0:
        return []
    return pd.DataFrame(cell)['id']

def rank_jaccard_fast(x, y, title, scale):
    windows = [31536000000,2592000000,604800000,86400000]
    labels = ['Year Window','Month Window','Week Window','Day Window', 'Hour Window']
    a4_dims = (11.7, 8.27)
    day_length=86400000
    fig, ax = plt.subplots(figsize=a4_dims)
    jaccard=pd.DataFrame({'Window Size':[], 'Jaccard':[]})
    #plt.title(title, size=30)
    plt.xlabel(x, size=20)
    plt.ylabel(y, size=20)
    ax.set_yscale(scale)
    
    means=np.zeros(5)
    sds=np.zeros(5)
    
    with open('degree/degreewindows.json') as json_file:
        degs = json.load(json_file)
        json_file.close()
        
    degs = pd.DataFrame(degs['views'])
    degs['time'] = pd.to_datetime(degs['time'],unit='ms')
    degs['index'] = pd.to_datetime(degs['time'],unit='ms')
    degs.set_index('index', inplace=True)
    
    index = 0
    for i in windows:
        n = round(i/day_length)
        y = degs[degs['windowsize'] == i]
        y = y.iloc[::n,:]
        y['topusers']=y.apply(lambda row: get_users(row['bestusers']),axis=1)
        y['nexttop'] = y['topusers'].shift(1)
        y['nexttop'][0]=[]
        y['jaccard']= y.apply(lambda x: 0.0, axis=1)
        y['jaccard'] = y.apply(lambda row: jaccard_similarity(row['topusers'],row['nexttop']), axis=1)
        means[index]=np.mean(y['jaccard'])
        sds[index]=np.std(y['jaccard'])
        jaccard['Time']=y['time']
        jaccard[labels[index]]=y['jaccard']
        
        ax.plot(y['time'],y['jaccard'], label=labels[index])
        index +=1
    
    jaccard.set_index('Time',inplace=True)
    
    with open('degree/degreehours.json') as json_file:
        degs = json.load(json_file)
        json_file.close()
    
    degs = pd.DataFrame(degs['views'])
    degs['time'] = pd.to_datetime(degs['time'],unit='ms')
    degs['index'] = pd.to_datetime(degs['time'],unit='ms')
    degs.set_index('index', inplace=True)
    
    y = degs
    y['topusers']=y.apply(lambda row: get_users(row['bestusers']),axis=1)
    y['nexttop'] = y['topusers'].shift(1)
    y['nexttop'][0]=[]

    y['jaccard']= y.apply(lambda x: 0.0, axis=1)
    y['jaccard'] = y.apply(lambda row: jaccard_similarity(row['topusers'],row['nexttop']), axis=1)
    means[index]=np.mean(y['jaccard'])
    sds[index]=np.std(y['jaccard'])
    jaccard['Time']=y['time']
    jaccard[labels[index]]=y['jaccard']

    ax.plot(y['time'],y['jaccard'], label=labels[index])
    
    print(means)
    print(sds)

    plt.legend(fontsize=20, loc='lower right')
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=30)
    plt.tight_layout()
    plt.savefig('graphs/JaccardSimilarity.png')
    plt.show()
    
    fig, ax = plt.subplots(figsize=a4_dims)
    windows = [31536000000,2592000000,604800000,86400000,3600000]
    windows.reverse()

    new_means = np.flip(means)
    new_sds = np.flip(sds)
    
    labels.reverse()
    print(labels)
    ax.bar(np.arange(5), new_means, yerr=new_sds, align='center', alpha=0.5, ecolor='black')
    plt.xlabel('Window Size',size=20)
    ax.set_xticks(np.arange(5))
    ax.set_xticklabels(labels,size=20,rotation=30)
    plt.ylabel('Similarity between consecutive windows', size=20)
    plt.savefig('Graphs/JaccardMeanSD.png')
    plt.show()
    
    
rank_jaccard_fast("Date", "Similarity", "Jaccard similarity index of consecutive node rankings", 'linear')
[0.         0.44928546 0.4808423  0.35890023 0.21373447]
[0.         0.17999554 0.13703931 0.0882071  0.0853066 ]
['Hour Window', 'Day Window', 'Week Window', 'Month Window', 'Year Window']

Effect of shuffling timestamps

As with the connected components, we look at the effect of randomly shuffling the timestamps. It seems to have a "smoothing effect", suggesting that in the original data, many of the users who reach the top 20 may only do so for a short period of time.

In [50]:
def rank_jaccard_shuffled(x, y, title, scale):
    windows = [31536000000,2592000000,604800000,86400000]
    labels = ['Year Window','Month Window','Week Window','Day Window', 'Hour Window']
    a4_dims = (11.7, 8.27)
    day_length=86400000
    fig, ax = plt.subplots(figsize=a4_dims)
    jaccard=pd.DataFrame({'Window Size':[], 'Jaccard':[]})
    #plt.title(title, size=30)
    plt.xlabel(x, size=20)
    plt.ylabel(y, size=20)
    ax.set_yscale(scale)
    
    means=np.zeros(5)
    sds=np.zeros(5)
    
    with open('degree/degreesorted.json') as json_file:
        degs = json.load(json_file)
        json_file.close()
        
    degs = pd.DataFrame(degs['views'])
    degs['time'] = pd.to_datetime(degs['time'],unit='ms')
    degs['index'] = pd.to_datetime(degs['time'],unit='ms')
    degs.set_index('index', inplace=True)
    
    index = 0
    for i in windows:
        n = round(i/day_length)
        y = degs[degs['windowsize'] == i]
        y = y.iloc[::n,:]
        y['topusers']=y.apply(lambda row: get_users(row['bestusers']),axis=1)
        y['nexttop'] = y['topusers'].shift(1)
        y['nexttop'][0]=[]
        y['jaccard']= y.apply(lambda x: 0.0, axis=1)
        y['jaccard'] = y.apply(lambda row: jaccard_similarity(row['topusers'],row['nexttop']), axis=1)
        means[index]=np.mean(y['jaccard'])
        sds[index]=np.std(y['jaccard'])
        jaccard['Time']=y['time']
        jaccard[labels[index]]=y['jaccard']
        
        ax.plot(y['time'],y['jaccard'], label=labels[index])
        index +=1
    
    jaccard.set_index('Time',inplace=True)
    
    with open('degree/degreesortedhour.json') as json_file:
        degs = json.load(json_file)
        json_file.close()
    
    degs = pd.DataFrame(degs['views'])
    degs['time'] = pd.to_datetime(degs['time'],unit='ms')
    degs['index'] = pd.to_datetime(degs['time'],unit='ms')
    degs.set_index('index', inplace=True)
    
    y = degs
    y['topusers']=y.apply(lambda row: get_users(row['bestusers']),axis=1)
    y['nexttop'] = y['topusers'].shift(1)
    y['nexttop'][0]=[]

    y['jaccard']= y.apply(lambda x: 0.0, axis=1)
    y['jaccard'] = y.apply(lambda row: jaccard_similarity(row['topusers'],row['nexttop']), axis=1)
    means[index]=np.mean(y['jaccard'])
    sds[index]=np.std(y['jaccard'])
    jaccard['Time']=y['time']
    jaccard[labels[index]]=y['jaccard']

    ax.plot(y['time'],y['jaccard'], label=labels[index])
    
    print(means)
    print(sds)

    plt.legend(fontsize=20, loc='lower right')
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=30)
    plt.tight_layout()
    plt.savefig('graphs/JaccardSimilarityShuffled.png')
    plt.show()
    
    fig, ax = plt.subplots(figsize=a4_dims)
    windows = [31536000000,2592000000,604800000,86400000,3600000]
    windows.reverse()

    new_means = np.flip(means)
    new_sds = np.flip(sds)
    
    labels.reverse()
    print(labels)
    ax.bar(np.arange(5), new_means, yerr=new_sds, align='center', alpha=0.5, ecolor='black')
    plt.xlabel('Window Size',size=20)
    ax.set_xticks(np.arange(5))
    ax.set_xticklabels(labels,size=20,rotation=30)
    plt.ylabel('Similarity between consecutive windows', size=20)
    plt.savefig('Graphs/JaccardMeanSDShuffled.png')
    plt.show()
    
    
rank_jaccard_shuffled("Date", "Similarity", "Jaccard similarity index of consecutive node rankings", 'linear')
[0.         0.87052342 0.8693952  0.63457222 0.12076518]
[0.         0.28023836 0.16617867 0.14761091 0.07694133]
['Hour Window', 'Day Window', 'Week Window', 'Month Window', 'Year Window']

Comparison with reference point

We also compute the similarity (for the real dataset) between top 20 users in each window size and the top 20 all-time top users at a reference point around Nov 16. We find:

  • By the end of the time series, even in the largest window nearly all the top 20 users have been replaced, with the JS dropping to 0.1
  • Even at the same time as the reference point, there are different top 20 users in smaller window sizes.
In [12]:
def get_users(cell):
    if len(cell)==0:
        return []
    return pd.DataFrame(cell)['id']

def rank_jaccard_year_comparison(x, y, title, scale):
    windows = [31536000000,2592000000,604800000,86400000]
    labels = ['Year Window','Month Window','Week Window','Day Window']
    a4_dims = (11.7, 8.27)
    fig, ax = plt.subplots(figsize=a4_dims)
    jaccard=pd.DataFrame({'Window Size':[], 'Jaccard':[]})
    #plt.title(title, size=30)
    plt.xlabel(x, size=30)
    plt.ylabel(y, size=30)
    ax.set_yscale(scale)
    
    with open('degree/degreewindows.json') as json_file:
        degs = json.load(json_file)
        json_file.close()
        
    degs = pd.DataFrame(degs['views'])
    degs['time'] = pd.to_datetime(degs['time'],unit='ms')
    degs['index'] = pd.to_datetime(degs['time'],unit='ms')
    degs.set_index('index', inplace=True)
    
    #get year comparison:
    year = degs[degs['windowsize']==31536000000]
    jan17top = get_users(year['bestusers']['2017-01-01 23:00:00'])
    
    index=0
    for i in windows:
        print(i)
        y = degs[degs['windowsize'] == i]
        y['topusers']=y.apply(lambda row: get_users(row['bestusers']),axis=1)
    
        y['jaccard']= y.apply(lambda x: 0.0, axis=1)
        y['jaccard'] = y.apply(lambda row: jaccard_similarity(row['topusers'],jan17top), axis=1)
        
        ax.plot(y['time'],y['jaccard'], label=labels[index])
        index +=1        
        
    plt.legend(fontsize=20, loc='upper right')
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=30)
    plt.tight_layout()
    plt.savefig('graphs/JaccardSimilarityJan17.png')
    plt.show()        
    
rank_jaccard_year_comparison('Date','Similarity','Similarity','linear')
31536000000
2592000000
604800000
86400000

For how long do users enter the top 20?

For each user who has ever been in the top 20 in any window, we count the number of windows (for each size) in which they appear in the top 20. For example, 90% of these users appear in the daily top 20 for less than 10% of the time period. Needs ironing out a bit I think as the explanation takes me ages to get my head around!

In [18]:
from collections import Counter
from itertools import chain

def how_many_windows(x,y,title,scale):
    
    windows = [31536000000,2592000000,604800000,86400000]
    labels = ['Year Window','Month Window','Week Window','Day Window']
    a4_dims = (11.7, 8.27)
    fig, ax = plt.subplots(figsize=a4_dims)
    #plt.title(title, size=30)
    plt.xlabel(x, size=20)
    plt.ylabel(y, size=20)
    ax.set_yscale(scale)
    
    with open('degree/degreewindows.json') as json_file:
        degs = json.load(json_file)
        json_file.close()
        
    degs = pd.DataFrame(degs['views'])
    degs['time'] = pd.to_datetime(degs['time'],unit='ms')
    degs['index'] = pd.to_datetime(degs['time'],unit='ms')
    degs.set_index('index', inplace=True)
    
    index = 0
    for i in windows:
        y = degs[degs['windowsize'] == i]
        total_windows = len(y)
        y['topusers']=y.apply(lambda row: get_users(row['bestusers']),axis=1)
        users_count = Counter(chain.from_iterable(set(row) for row in y['topusers']))
        meta = Counter(users_count.values())
        x1,y1 = zip(*sorted(meta.items()))
        x1 = np.array(x1)/total_windows
        y1 = np.cumsum(np.array(y1))/len(users_count.items())
        cdf = 1-y1
        ax.plot(x1,y1, label=labels[index])
        index+=1
        
    plt.legend()
    plt.tight_layout()
    plt.show()
    
how_many_windows('Proportion of windows','CDF of users in top 20 for that proportion','lol','linear')

OLD STUFF

In [38]:
windows = [31536000000,2592000000,604800000,86400000]
with open('bigCC.json') as json_file:
    x = json.load(json_file)
    x= pd.DataFrame(x['views'])
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    x = x[x['total'] != 0]
    a4_dims = (11.7, 8.27)
    fig, ax = plt.subplots(figsize=a4_dims)
    plt.title('Biggest Connected Components % of graph')
    plt.xlabel('Date')
    plt.ylabel('Proportion of Graph')
    for i in windows:
        y = x[x['windowsize'] == i]
        y['mean'] = y['proportion'].rolling(window=4,center=False).mean()
        hours = str(int((i/3600000)))+" hour window"
        y.plot(x='time', y='mean',ax=ax, label=hours)
    plt.show()





windows = [31536000000,2592000000,604800000,86400000]
y=0
x=0
z=0
a=0
with open('bigCC.json') as json_file:
    x = json.load(json_file)
    x= pd.DataFrame(x['views'])
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    x = x[x['total'] != 0]
    
with open('bigCC2.json') as json_file:
    y = json.load(json_file)
    y= pd.DataFrame(y['views'])
    y['time'] = pd.to_datetime(y['time'],unit='ms')
    y = y[y['total'] != 0]
    
with open('bigCC3.json') as json_file:
    z = json.load(json_file)
    z = pd.DataFrame(z['views'])
    z['time'] = pd.to_datetime(z['time'],unit='ms')
    z = z[z['total'] != 0]
    
with open('bigCC4.json') as json_file:
    a = json.load(json_file)
    a = pd.DataFrame(a['views'])
    a['time'] = pd.to_datetime(z['time'],unit='ms')
    a = a[a['total'] != 0]
    

a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
plt.title('Time taken to process view')
plt.xlabel('Date')
plt.ylabel('Time in Milliseconds')
x.plot(x='time', y='viewTime',ax=ax, label="Full view with all windows")
y.plot(x='time', y='viewTime',ax=ax, label="Full view with all windows with caching")
z.plot(x='time', y='viewTime',ax=ax, label="Full view with all windows Parallel")
a.plot(x='time', y='viewTime',ax=ax, label="Full view with all windows vote")


plt.show()
In [23]:
def percentagePlot(toPlot,title,x,y,scale,start,end):
    windows = [31536000000,2592000000,604800000, 86400000]
    labels = ['Year Window','Month Window','Week Window','Day Window']
    a4_dims = (11.7, 8.27)
    fig, ax = plt.subplots(figsize=a4_dims)
    #plt.title(title,fontsize=30)
    plt.xlabel(x,fontsize=30)
    plt.ylabel(y,fontsize=30)
    ax.set_yscale(scale)
    
    color=next(ax._get_lines.prop_cycler)['color']

    with open('degree/degreewindows.json') as json_file:
        with open('degree/degreesorted.json') as json_2:
            cc1 = json.load(json_file)
            cc2 = json.load(json_2)
            cc1= pd.DataFrame(cc1['views'])
            cc2= pd.DataFrame(cc2['views'])
            cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
            cc2['time'] = pd.to_datetime(cc2['time'],unit='ms')
            cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
            cc2['index'] = pd.to_datetime(cc2['time'],unit='ms')
            cc1.set_index('index', inplace=True)
            cc2.set_index('index', inplace=True)
            cc1=cc1[start:end]
            cc2=cc2[start:end]
            index = 0
            for i in windows:
                color=next(ax._get_lines.prop_cycler)['color']
                y = cc1[cc1['windowsize'] == i]
                z = cc2[cc2['windowsize'] == i]
                y['avgdeg']= np.where(y['vertices']<1, y['vertices'], 2*y['edges']/y['vertices'])
                z['avgdeg']= np.where(z['vertices']<1, z['vertices'], 2*z['edges']/z['vertices'])
                z['perc'] = np.where(z[toPlot]<1, 0, 100.0*y[toPlot]/z[toPlot])
                #y = y[y[toPlot] != 0]
                z.plot(x='time', y='perc',ax=ax, label=labels[index], color = color, linestyle="-", alpha=0.8)
                index +=1

    with open('degree/degreehours.json') as json_file:
        with open('degree/degreesortedhour.json') as json_2:
            color=next(ax._get_lines.prop_cycler)['color']
            x = json.load(json_file)
            y = json.load(json_2)
            x= pd.DataFrame(x['views'])
            y= pd.DataFrame(y['views'])
            x['index'] = pd.to_datetime(x['time'],unit='ms')
            y['index'] = pd.to_datetime(y['time'],unit='ms')
            x['time'] = pd.to_datetime(x['time'],unit='ms')
            y['time'] = pd.to_datetime(y['time'],unit='ms')
            x.set_index('index', inplace=True)
            y.set_index('index', inplace=True)
            x =x[start:end]
            y =y[start:end]
            x['avgdeg']= np.where(x['vertices']<1, x['vertices'], 2*x['edges']/x['vertices'])
            y['avgdeg']= np.where(y['vertices']<1, y['vertices'], 2*y['edges']/y['vertices'])
            y['perc'] = np.where(y[toPlot]<1, 0, 100.0*x[toPlot]/y[toPlot])
            x['hour'] = x.apply(lambda row: int(row['time'].hour),axis=1)
            y['hour'] = y.apply(lambda row: int(row['time'].hour),axis=1)
            x = x[x[toPlot] != 0]
            y = y[y[toPlot] != 0]
            y.plot(x='time', y='perc',ax=ax, color=color, label='Hour', alpha=0.3)
        
    #plt.axvline('2016-11-09',linestyle='')
    #plt.axvline('2017-08-11')
    #plt.axvline('2017-08-13')
    plt.legend(fontsize=20, loc='upper left')
    plt.xlabel('Date',fontsize=30)
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)
    plt.savefig('graphs/'+toPlot+'.png')
    plt.show()
In [ ]:
percentagePlot('edges','Number of Vertices','Date','Percentage against shuffled','linear','2016-09-30 23:00:00','2018-10-30')